
```{r setup, include=FALSE}
library(tm)
library(dplyr)
library(lolog) 
library(network)
library(dotwhisker) 
library(magrittr)
library(gridExtra) 
library(quanteda.textstats)
library(quanteda) 
library(igraph)
library(stringr)
library(SentimentAnalysis)
```


```{r}
# Read every PDF in the folder with tm's readPDF reader.
pdf_corpus <- Corpus(DirSource("02.19.09/"), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

df_c

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf

max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Load the comment metadata and lower-case the name column used for sorting.
comment <- read.csv("public_comments.csv")

comment$name <- tolower(comment$name)

# Rows of `comment` with comment_number "1", ordered by name.
comment_1 <- comment %>%
  filter(comment_number == "1") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_1) == length(max_cos),
  nrow(comment_1) == length(sentiment)
)
feb_19 <- cbind(comment_1, max_cos, sentiment)

feb_19_2009 <- feb_19

feb_19
```

```{r}
# Read every PDF in the folder with tm's readPDF reader.
pdf_corpus <- Corpus(DirSource("10.01.13-1/"), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))
#values_to_remove <- 1
#avg_cos <- cos_sim[!cos_sim %in% values_to_remove]

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "3", ordered by name.
comment_3 <- comment %>%
  filter(comment_number == "3") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_3) == length(max_cos),
  nrow(comment_3) == length(sentiment)
)
october_1_13 <- cbind(comment_3, max_cos, sentiment)

october_1_13_1 <- october_1_13
```

```{r}
# Read every PDF in the folder with tm's readPDF reader.
pdf_corpus <- Corpus(DirSource("10.01.13-2/"), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf

max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "2", ordered by name.
comment_2 <- comment %>%
  filter(comment_number == "2") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_2) == length(max_cos),
  nrow(comment_2) == length(sentiment)
)
october_1_13 <- cbind(comment_2, max_cos, sentiment)

october_1_13_2 <- october_1_13
```

```{r} 
pdf_files <- list.files("12.22.13/", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Number that follows "Part" in each path, used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts `parts` ascending. `files` is not used;
# it is kept so the function signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}

# Put the PDF paths in ascending part-number order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

# Build the corpus from the sorted paths with tm's readPDF reader.
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf

max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "4", ordered by name.
comment_4 <- comment %>%
  filter(comment_number == "4") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_4) == length(max_cos),
  nrow(comment_4) == length(sentiment)
)
december_22_13 <- cbind(comment_4, max_cos, sentiment)
```

```{r}
# Read every PDF in the folder with tm's readPDF reader.
pdf_corpus <- Corpus(DirSource("01.15.14-1/"), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "6", ordered by name.
# NOTE(review): comment_5 is filtered on "6" while comment_6 is filtered on
# "5" — confirm this pairing is intended.
comment_5 <- comment %>%
  filter(comment_number == "6") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_5) == length(max_cos),
  nrow(comment_5) == length(sentiment)
)
jan_1_14_1 <- cbind(comment_5, max_cos, sentiment)

jan_1_14 <- jan_1_14_1
```

```{r}
# Read every PDF in the folder with tm's readPDF reader.
pdf_corpus <- Corpus(DirSource("01.15.14-2/"), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "5", ordered by name.
# NOTE(review): comment_6 is filtered on "5" while comment_5 is filtered on
# "6" — confirm this pairing is intended.
comment_6 <- comment %>%
  filter(comment_number == "5") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_6) == length(max_cos),
  nrow(comment_6) == length(sentiment)
)
jan_1_14_2 <- cbind(comment_6, max_cos, sentiment)
```


```{r}
pdf_files <- list.files("02.23.14", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Number that follows "Part" in each path, used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts `parts` ascending. `files` is not used;
# it is kept so the function signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}

# Put the PDF paths in ascending part-number order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

# Build the corpus from the sorted paths with tm's readPDF reader.
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "7", ordered by name.
comment_7 <- comment %>%
  filter(comment_number == "7") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_7) == length(max_cos),
  nrow(comment_7) == length(sentiment)
)
feb_23_14 <- cbind(comment_7, max_cos, sentiment)
```

```{r}
pdf_files <- list.files("04.09.14", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Number that follows "Part" in each path, used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts `parts` ascending. `files` is not used;
# it is kept so the function signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}

# Put the PDF paths in ascending part-number order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

# Build the corpus from the sorted paths with tm's readPDF reader.
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "8", ordered by name.
comment_8 <- comment %>%
  filter(comment_number == "8") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_8) == length(max_cos),
  nrow(comment_8) == length(sentiment)
)
april_9_14 <- cbind(comment_8, max_cos, sentiment)
```

```{r}
# Read every PDF in the folder with tm's readPDF reader.
pdf_corpus <- Corpus(DirSource("04.11.14/"), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "9", ordered by name.
comment_9 <- comment %>%
  filter(comment_number == "9") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_9) == length(max_cos),
  nrow(comment_9) == length(sentiment)
)
april_11 <- cbind(comment_9, max_cos, sentiment)

april_11_14 <- april_11
```

```{r}
pdf_files <- list.files("04.14.14", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Number that follows "Part" in each path, used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts `parts` ascending. `files` is not used;
# it is kept so the function signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}

# Put the PDF paths in ascending part-number order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

# Build the corpus from the sorted paths with tm's readPDF reader.
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "10", ordered by name.
comment_10 <- comment %>%
  filter(comment_number == "10") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_10) == length(max_cos),
  nrow(comment_10) == length(sentiment)
)
april_14 <- cbind(comment_10, max_cos, sentiment)

april_14_14 <- april_14
```

```{r}
# does not work; array error
# NOTE(review): one possible cause is sapply() simplifying the per-document
# text to a matrix; lapply() below avoids that — re-run to confirm.
# Read every PDF in the folder with tm's readPDF reader.
pdf_corpus <- Corpus(DirSource("05.02.14/"), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))
# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` whose date equals "201452", ordered by name.
# NOTE(review): the other chunks select on comment_number; this one selects
# on date — confirm that is intended.
comment_11 <- comment %>%
  filter(date == "201452") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_11) == length(max_cos),
  nrow(comment_11) == length(sentiment)
)
may_2 <- cbind(comment_11, max_cos, sentiment)

may_2_14 <- may_2
```

```{r}
# Read every PDF in the folder with tm's readPDF reader.
pdf_corpus <- Corpus(DirSource("09.19.14/"), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))
# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "12", ordered by name.
comment_12 <- comment %>%
  filter(comment_number == "12") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_12) == length(max_cos),
  nrow(comment_12) == length(sentiment)
)
sep_19 <- cbind(comment_12, max_cos, sentiment)

sep_19_14 <- sep_19
```

```{r}
pdf_files <- list.files("01.09.15-1", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Number that follows "Part" in each path, used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts `parts` ascending. `files` is not used;
# it is kept so the function signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}

# Put the PDF paths in ascending part-number order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

# Build the corpus from the sorted paths with tm's readPDF reader.
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "14", ordered by name.
comment_14 <- comment %>%
  filter(comment_number == "14") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_14) == length(max_cos),
  nrow(comment_14) == length(sentiment)
)
jan_9_15_1 <- cbind(comment_14, max_cos, sentiment)
```

```{r}
pdf_files <- list.files("01.09.15-2", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Number that follows "Part" in each path, used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts `parts` ascending. `files` is not used;
# it is kept so the function signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}

# Put the PDF paths in ascending part-number order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

# Build the corpus from the sorted paths with tm's readPDF reader.
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "13", ordered by name.
comment_13 <- comment %>%
  filter(comment_number == "13") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_13) == length(max_cos),
  nrow(comment_13) == length(sentiment)
)
jan_9_15 <- cbind(comment_13, max_cos, sentiment)

jan_9_15_2 <- jan_9_15
```

```{r}
pdf_files <- list.files("01.14.15", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Number that follows "Part" in each path, used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts `parts` ascending. `files` is not used;
# it is kept so the function signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}

# Put the PDF paths in ascending part-number order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

# Build the corpus from the sorted paths with tm's readPDF reader.
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "15", ordered by name.
comment_15 <- comment %>%
  filter(comment_number == "15") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_15) == length(max_cos),
  nrow(comment_15) == length(sentiment)
)
jan_14_15 <- cbind(comment_15, max_cos, sentiment)
```


```{r}
pdf_files <- list.files("01.16.15", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Number that follows "Part" in each path, used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts `parts` ascending. `files` is not used;
# it is kept so the function signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}

# Put the PDF paths in ascending part-number order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

# Build the corpus from the sorted paths with tm's readPDF reader.
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "16", ordered by name.
comment_16 <- comment %>%
  filter(comment_number == "16") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_16) == length(max_cos),
  nrow(comment_16) == length(sentiment)
)
jan_16_15 <- cbind(comment_16, max_cos, sentiment)
```

```{r}
pdf_files <- list.files("02.06.15", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Number that follows "Part" in each path, used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts `parts` ascending. `files` is not used;
# it is kept so the function signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}

# Put the PDF paths in ascending part-number order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

# Build the corpus from the sorted paths with tm's readPDF reader.
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "20", ordered by name.
comment_20 <- comment %>%
  filter(comment_number == "20") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_20) == length(max_cos),
  nrow(comment_20) == length(sentiment)
)
feb_6_15_1 <- cbind(comment_20, max_cos, sentiment)
```

```{r}
pdf_files <- list.files("02.06.15-2", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Number that follows "Part" in each path, used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts `parts` ascending. `files` is not used;
# it is kept so the function signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}

# Put the PDF paths in ascending part-number order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

# Build the corpus from the sorted paths with tm's readPDF reader.
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))
# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "19", ordered by name.
comment_19 <- comment %>%
  filter(comment_number == "19") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_19) == length(max_cos),
  nrow(comment_19) == length(sentiment)
)
feb_6_15_2 <- cbind(comment_19, max_cos, sentiment)
```

```{r}
pdf_files <- list.files("02.06.15-3", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Number that follows "Part" in each path, used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts `parts` ascending. `files` is not used;
# it is kept so the function signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}

# Put the PDF paths in ascending part-number order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

# Build the corpus from the sorted paths with tm's readPDF reader.
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "18", ordered by name.
comment_18 <- comment %>%
  filter(comment_number == "18") %>%
  arrange(name)

# cbind() matches purely by position and silently recycles a shorter vector,
# so stop if the metrics do not line up one-to-one with the rows.
# NOTE(review): rows are ordered by name, max_cos/sentiment by PDF part
# number — confirm the two orders coincide.
stopifnot(
  nrow(comment_18) == length(max_cos),
  nrow(comment_18) == length(sentiment)
)
feb_6_15_3 <- cbind(comment_18, max_cos, sentiment)
```


```{r}
pdf_files <- list.files("02.06.15-4", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Number that follows "Part" in each path, used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts `parts` ascending. `files` is not used;
# it is kept so the function signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}

# Put the PDF paths in ascending part-number order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

# Build the corpus from the sorted paths with tm's readPDF reader.
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns a list; sapply() collapses to a matrix when every
# document yields the same number (> 1) of text elements, and the
# per-document names relied on below are then lost.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(text = unlist(pdf_text), stringsAsFactors = FALSE)
# Keep the row name up to the literal ".pdf" extension (plus any index that
# unlist() appended). The previous split on the regex ".p" also cut at any
# character followed by "p" inside the file name.
df$file <- sub("\\.pdf\\d*$", "", rownames(df), ignore.case = TRUE)

# One row per file: join its text elements, then sort by the "Part" number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# remove_twitter= and ngrams= were dropped here: quanteda 3.x tokens() does
# not accept them and only warns that they are unused. The 5-grams are built
# with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)

toks <- tokens_tolower(toks)

# Document-feature matrix of 5-grams, then stemmed.
toks5 <- dfm(tokens_ngrams(toks, n = 5))

toks_dfm <- dfm_wordstem(toks5)

# Document-by-document cosine similarity.
cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Take self-comparisons out of the running for the row maximum.
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 17, ordered by commenter name
# NOTE(review): folder "02.06.15-4" is paired with comment 17 while feb_6_15_3
# uses 18 - confirm the two numbers are not swapped.
comment_17 <- comment %>% 
  filter(comment_number == "17") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_17),
  length(sentiment) == nrow(comment_17)
)
feb_6_15_4 <- cbind(comment_17, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "02.20.15": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("02.20.15", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 21, ordered by commenter name
comment_21 <- comment %>% 
  filter(comment_number == "21") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_21),
  length(sentiment) == nrow(comment_21)
)
feb_20_15 <- cbind(comment_21, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "04.30.15": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("04.30.15", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 22, ordered by commenter name
comment_22 <- comment %>% 
  filter(comment_number == "22") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_22),
  length(sentiment) == nrow(comment_22)
)
april_30_15 <- cbind(comment_22, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "05.01.15": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("05.01.15", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 23, ordered by commenter name
comment_23 <- comment %>% 
  filter(comment_number == "23") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_23),
  length(sentiment) == nrow(comment_23)
)
may_1_15 <- cbind(comment_23, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "05.08.15": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("05.08.15", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 24, ordered by commenter name
comment_24 <- comment %>% 
  filter(comment_number == "24") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_24),
  length(sentiment) == nrow(comment_24)
)
may_8_15 <- cbind(comment_24, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "05.29.15": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("05.29.15", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 25, ordered by commenter name
comment_25 <- comment %>% 
  filter(comment_number == "25") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_25),
  length(sentiment) == nrow(comment_25)
)
may_29_15 <- cbind(comment_25, max_cos, sentiment)
```



```{r}
# Score the comment PDFs in "06.12.15": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("06.12.15", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 26, ordered by commenter name
comment_26 <- comment %>% 
  filter(comment_number == "26") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_26),
  length(sentiment) == nrow(comment_26)
)
june_12_15 <- cbind(comment_26, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "06.17.15": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("06.17.15", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 27, ordered by commenter name
comment_27 <- comment %>% 
  filter(comment_number == "27") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_27),
  length(sentiment) == nrow(comment_27)
)
june_17_15 <- cbind(comment_27, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "06.18.15": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("06.18.15", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 28, ordered by commenter name
comment_28 <- comment %>% 
  filter(comment_number == "28") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_28),
  length(sentiment) == nrow(comment_28)
)
june_18_15 <- cbind(comment_28, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "04.01.16": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("04.01.16", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 48, ordered by commenter name
comment_48 <- comment %>% 
  filter(comment_number == "48") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_48),
  length(sentiment) == nrow(comment_48)
)
april_1_16 <- cbind(comment_48, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "04.22.16": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("04.22.16", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 49, ordered by commenter name
comment_49 <- comment %>% 
  filter(comment_number == "49") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_49),
  length(sentiment) == nrow(comment_49)
)
april_22_2016 <- cbind(comment_49, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "06.30.16": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("06.30.16", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 29, ordered by commenter name
# NOTE(review): folder "06.30.16" is paired with comment 29 while the
# neighbouring 2016 folders use 48-55 - confirm the number (or folder year).
comment_29 <- comment %>% 
  filter(comment_number == "29") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_29),
  length(sentiment) == nrow(comment_29)
)
june_30_16 <- cbind(comment_29, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "08.16.16": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("08.16.16", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 50, ordered by commenter name
comment_50 <- comment %>% 
  filter(comment_number == "50") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_50),
  length(sentiment) == nrow(comment_50)
)
aug_16_16 <- cbind(comment_50, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "08.16.16-2": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("08.16.16-2", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 51, ordered by commenter name
comment_51 <- comment %>% 
  filter(comment_number == "51") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_51),
  length(sentiment) == nrow(comment_51)
)
aug_16_16_2 <- cbind(comment_51, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "09.05.16": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("09.05.16", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 52, ordered by commenter name
comment_52 <- comment %>% 
  filter(comment_number == "52") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_52),
  length(sentiment) == nrow(comment_52)
)
sep_5_16 <- cbind(comment_52, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "09.05.16-2": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("09.05.16-2", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 53, ordered by commenter name
comment_53 <- comment %>% 
  filter(comment_number == "53") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_53),
  length(sentiment) == nrow(comment_53)
)
sep_5_16_2 <- cbind(comment_53, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "09.08.16": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("09.08.16", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Comments filed under comment_number 54, ordered by commenter name
comment_54 <- comment %>% 
  filter(comment_number == "54") %>% 
  arrange(name)

# cbind() pairs rows purely by position and silently recycles a shorter vector
# when the row count is an exact multiple, so fail fast on a length mismatch.
# NOTE(review): this also assumes name order matches the PDFs' part order - confirm.
stopifnot(
  length(max_cos) == nrow(comment_54),
  length(sentiment) == nrow(comment_54)
)
sep_8_16 <- cbind(comment_54, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "09.19.16": `max_cos` is each document's highest
# cosine similarity to any other document (stemmed 5-grams); `sentiment` is its
# SentimentQDAP score from analyzeSentiment().
pdf_files <- list.files("09.19.16", full.names = TRUE, pattern = "\\.pdf$", recursive = FALSE)

# Part number embedded in each file path
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that orders `parts` ascending (`files` is not used)
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in numeric part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always yields a named list; sapply() collapses to a matrix when every
# document has the same number (> 1) of text elements, which breaks the steps below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Drop ".pdf" and the element index unlist() appends. The old strsplit(., ".p")
# treated ".p" as a regex, so it split at the first "p" preceded by any character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by part number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda and were only being ignored with a warning; tokens_ngrams() below
# builds the n-grams.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Exclude self-comparisons before taking each row's maximum
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "55", sorted by `name`.
comment_55 <- filter(comment, comment_number == "55")
comment_55 <- arrange(comment_55, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
sep_19_16 <- cbind(comment_55, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "02.03.17": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_files <- list.files("02.03.17", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# "Part<N>" number embedded in each file name
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts by `parts`; `files` is unused but kept
# so the two-argument call below stays valid.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "56", sorted by `name`.
comment_56 <- filter(comment, comment_number == "56")
comment_56 <- arrange(comment_56, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
feb_3_17 <- cbind(comment_56, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "06.30.17": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_files <- list.files("06.30.17", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# "Part<N>" number embedded in each file name
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts by `parts`; `files` is unused but kept
# so the two-argument call below stays valid.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "57", sorted by `name`.
comment_57 <- filter(comment, comment_number == "57")
comment_57 <- arrange(comment_57, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
june_30_17 <- cbind(comment_57, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "08.10.17": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_files <- list.files("08.10.17", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# "Part<N>" number embedded in each file name
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts by `parts`; `files` is unused but kept
# so the two-argument call below stays valid.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "30", sorted by `name`.
comment_30 <- filter(comment, comment_number == "30")
comment_30 <- arrange(comment_30, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
aug_10_17 <- cbind(comment_30, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "09.15.17-1": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_files <- list.files("09.15.17-1", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# "Part<N>" number embedded in each file name
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts by `parts`; `files` is unused but kept
# so the two-argument call below stays valid.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
# Print the intermediate table (before the columns are renamed) for inspection
df
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "31", sorted by `name`.
comment_31 <- filter(comment, comment_number == "31")
comment_31 <- arrange(comment_31, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
sep_15_17_1 <- cbind(comment_31, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "09.15.17-2": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_files <- list.files("09.15.17-2", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# "Part<N>" number embedded in each file name
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts by `parts`; `files` is unused but kept
# so the two-argument call below stays valid.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "32", sorted by `name`.
comment_32 <- filter(comment, comment_number == "32")
comment_32 <- arrange(comment_32, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
# NOTE(review): the result is called sep_19_17_2 although the PDFs were read
# from "09.15.17-2"; the name is kept because the final rbind() refers to it.
sep_19_17_2 <- cbind(comment_32, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "10.13.17": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_files <- list.files("10.13.17", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# "Part<N>" number embedded in each file name
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts by `parts`; `files` is unused but kept
# so the two-argument call below stays valid.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "34", sorted by `name`.
comment_34 <- filter(comment, comment_number == "34")
comment_34 <- arrange(comment_34, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
oct_13_17 <- cbind(comment_34, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "10.20.17": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_files <- list.files("10.20.17", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# "Part<N>" number embedded in each file name
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts by `parts`; `files` is unused but kept
# so the two-argument call below stays valid.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "35", sorted by `name`.
comment_35 <- filter(comment, comment_number == "35")
comment_35 <- arrange(comment_35, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
oct_20_17 <- cbind(comment_35, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "01.15.18": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_files <- list.files("01.15.18", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# "Part<N>" number embedded in each file name
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts by `parts`; `files` is unused but kept
# so the two-argument call below stays valid.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "36", sorted by `name`.
comment_36 <- filter(comment, comment_number == "36")
comment_36 <- arrange(comment_36, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
jan_15_18 <- cbind(comment_36, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "06.20.18-1": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_files <- list.files("06.20.18-1", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# "Part<N>" number embedded in each file name
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts by `parts`; `files` is unused but kept
# so the two-argument call below stays valid.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "38", sorted by `name`.
comment_38 <- filter(comment, comment_number == "38")
comment_38 <- arrange(comment_38, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
june_20_18_1 <- cbind(comment_38, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "06.20.18-2": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_files <- list.files("06.20.18-2", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# "Part<N>" number embedded in each file name
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts by `parts`; `files` is unused but kept
# so the two-argument call below stays valid.
custom_sort <- function(files, parts) {
  order(parts)
}

# Put the PDFs in ascending part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

# Print the sorted paths for inspection
pdf_files

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "37", sorted by `name`.
# NOTE(review): the "06.20.18-2" scores are paired with comment 37 here while
# "06.20.18-1" is paired with comment 38 -- confirm the pairing is intended.
comment_37 <- filter(comment, comment_number == "37")
comment_37 <- arrange(comment_37, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
june_20_18 <- cbind(comment_37, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "07.06.18": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_files <- list.files("07.06.18", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# "Part<N>" number embedded in each file name
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts by `parts`; `files` is unused but kept
# so the two-argument call below stays valid.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "58", sorted by `name`.
comment_58 <- filter(comment, comment_number == "58")
comment_58 <- arrange(comment_58, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
july_6_18 <- cbind(comment_58, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "09.07.18": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_files <- list.files("09.07.18", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# "Part<N>" number embedded in each file name
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts by `parts`; `files` is unused but kept
# so the two-argument call below stays valid.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "39", sorted by `name`.
comment_39 <- filter(comment, comment_number == "39")
comment_39 <- arrange(comment_39, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
sep_7_18 <- cbind(comment_39, max_cos, sentiment)
```

```{r}
# Score the comment PDFs in "03.06.19/": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
# Original note: "array issues" -- presumably sapply() returning a matrix
# here; lapply() below avoids that shape. TODO confirm.
pdf_corpus <- Corpus(DirSource("03.06.19/"),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` whose `date` is "201936", sorted by `name`.
# NOTE(review): the key looks like year+month+day without zero padding
# (presumably 2019-03-06) -- confirm it is unambiguous in the CSV.
comment_40 <- filter(comment, date == "201936")
comment_40 <- arrange(comment_40, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
mar_6_19 <- cbind(comment_40, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "05.10.19": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_files <- list.files("05.10.19", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# "Part<N>" number embedded in each file name
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Returns the permutation that sorts by `parts`; `files` is unused but kept
# so the two-argument call below stays valid.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending part order
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

# Print the per-file table for inspection
df_c

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` whose `date` is "2019510", sorted by `name`.
# NOTE(review): the key looks like year+month+day without zero padding
# (presumably 2019-05-10) -- confirm it is unambiguous in the CSV.
comment_41 <- filter(comment, date == "2019510")
comment_41 <- arrange(comment_41, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
may_10_19 <- cbind(comment_41, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "11.12.19/": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
# Original note: "lots of array issues" -- presumably sapply() returning a
# matrix here; lapply() below avoids that shape. TODO confirm.
pdf_corpus <- Corpus(DirSource("11.12.19/"),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

# Print the per-file table for inspection
df_c

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` whose `date` is "20191112", sorted by `name`.
# NOTE(review): the key looks like year+month+day (presumably 2019-11-12);
# other chunks use unpadded keys -- confirm the format in the CSV.
comment_42 <- filter(comment, date == "20191112")
comment_42 <- arrange(comment_42, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
nov_12_19 <- cbind(comment_42, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "12.02.19/": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_corpus <- Corpus(DirSource("12.02.19/"),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` whose `date` is "2019122", sorted by `name`.
# NOTE(review): without zero padding this key could read as 2019-12-02 or
# 2019-01-22 -- confirm it selects only the intended rows.
comment_43 <- filter(comment, date == "2019122")
comment_43 <- arrange(comment_43, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
dec_2_19 <- cbind(comment_43, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "03.06.20/": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_corpus <- Corpus(DirSource("03.06.20/"),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "44", sorted by `name`.
comment_44 <- filter(comment, comment_number == "44")
comment_44 <- arrange(comment_44, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
march_6_2020 <- cbind(comment_44, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "04.13.20/": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
# Original note: "array issues" -- presumably sapply() returning a matrix
# here; lapply() below avoids that shape. TODO confirm.
pdf_corpus <- Corpus(DirSource("04.13.20/"),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "45", sorted by `name`.
comment_45 <- filter(comment, comment_number == "45")
comment_45 <- arrange(comment_45, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
april_13_2020 <- cbind(comment_45, max_cos, sentiment)
```


```{r}
# Score the comment PDFs in "12.14.20/": leaves `max_cos` (each document's
# highest cosine similarity to any other document, on stemmed 5-grams) and
# `sentiment` (QDAP sentiment score per document) for the chunk that follows.
pdf_corpus <- Corpus(DirSource("12.14.20/"),
                     readerControl = list(reader = readPDF))

# lapply(), not sapply(): sapply() simplifies to a matrix when every document
# yields the same number of text elements, which breaks the data frame below.
pdf_text <- lapply(pdf_corpus, as.character)

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Strip ".pdf" plus any index unlist() appended. The previous
# strsplit(., ".p") treated ".p" as a regex (any character followed by "p"),
# so a lowercase "p" inside a file name truncated the name.
df$files <- sub("\\.pdf[0-9]*$", "", df$files)
names(df) <- c('text','file')

# One row per file, ordered by its "Part<N>" number
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text),
         file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in current
# quanteda; passing them only raised warnings. N-grams come from
# tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

cos_sim <- as.matrix(textstat_simil(toks_dfm,
                                    margin = "documents",
                                    method = "cosine"))

# Mask self-comparisons so the row maximum is over other documents only
diag(cos_sim) <- -Inf
max_cos <- apply(cos_sim, 1, max, na.rm = TRUE)
sentiment <- analyzeSentiment(for_token)$SentimentQDAP
```

```{r}
# Rows of `comment` with comment_number "46", sorted by `name`.
comment_46 <- filter(comment, comment_number == "46")
comment_46 <- arrange(comment_46, name)

# Attach the scores column-wise. Rows are paired by position only, so this
# assumes the `name` order matches the document order behind `max_cos` and
# `sentiment` -- TODO confirm.
dec_14_2020 <- cbind(comment_46, max_cos, sentiment)
```

```{r}
# Stack every per-docket result table, in the order listed, into one data
# frame (each is passed through as.data.frame() first) and save it as CSV.
final_text_analysis <- do.call(rbind, lapply(list(
  feb_19_2009, october_1_13_1, october_1_13_2, december_22_13,
  jan_1_14, jan_1_14_2, feb_23_14, april_9_14, april_11_14, april_14_14,
  may_2_14, sep_19_14,
  jan_9_15_1, jan_9_15_2, jan_14_15, jan_16_15,
  feb_6_15_1, feb_6_15_2, feb_6_15_3, feb_6_15_4, feb_20_15,
  april_30_15, may_1_15, may_8_15, may_29_15,
  june_12_15, june_17_15, june_18_15,
  april_1_16, april_22_2016, june_30_16, aug_16_16, aug_16_16_2,
  sep_5_16, sep_5_16_2, sep_8_16, sep_19_16,
  feb_3_17, june_30_17, aug_10_17, sep_15_17_1, sep_19_17_2,
  oct_13_17, oct_20_17,
  jan_15_18, june_20_18_1, june_20_18, july_6_18, sep_7_18,
  mar_6_19, may_10_19, nov_12_19, dec_2_19,
  march_6_2020, april_13_2020, dec_14_2020
), as.data.frame))

write.csv(final_text_analysis, "comments_max_cos.csv")
```

